How complete is the CDC's COVID-19 case surveillance and death certificate data for deaths by race/ethnicity at the state and county levels?

Feb 3, 2021

In [ ]:
#@title
import pandas as pd
import altair as alt
from vega_datasets import data

from google.colab import auth
auth.authenticate_user()

# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
#%load_ext google.colab.data_table
In [2]:
#@title
def FieldAnalysis(project_id, table, field_list):
  dict = {}
  for field in field_list:
      dict[field] = [0.0, 0.0, 0.0, 0.0]
  unknowns = pd.DataFrame(dict, index=['Unknown', 'Missing', 'NA', 'Known'])
  field_series = []
  value_series = []
  percent_series = []

  for field in field_list:
    field_unknowns_query = ('''
    SELECT
      %s,
      count(*) as cases
    FROM
      %s
    WHERE
      death_yn = 'Yes'
    GROUP BY
      %s
    ''')
    query = field_unknowns_query % (field, table, field)
    field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
    field_unknowns_df.set_index(field, inplace=True)
    field_unknowns_df.index = field_unknowns_df.index.fillna('Null')

    field_display_name = {
        'cdc_case_earliest_dt': 'CDC earliest case date',
        'current_status': 'Case status',
        'res_state': 'State',
        'res_county': 'County',
        'sex': 'Sex',
        'age_group': 'Age',
        'race_ethnicity_combined': 'Race/Ethnicity'}

    missing_count = 0
    if 'Missing' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Missing'].cases
    if 'Null' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Null'].cases
    #if field_unknowns_df.index.isnull().any():
    #  missing_count += field_unknowns_df.loc[field_unknowns_df.index.isnull()].cases
    unknowns.loc['Missing', field] = missing_count / field_unknowns_df.cases.sum()

    if 'Unknown' in field_unknowns_df.index:
      unknowns.loc['Unknown', field] = field_unknowns_df.loc['Unknown'].cases / field_unknowns_df.cases.sum()
    if 'NA' in field_unknowns_df.index:
      unknowns.loc['NA', field] = field_unknowns_df.loc['NA'].cases / field_unknowns_df.cases.sum()
    unknowns.loc['Known', field] = 1 - (unknowns.loc['Missing', field] +
                                        unknowns.loc['Unknown', field] +
                                        unknowns.loc['NA', field])
    field_series.extend([field_display_name.get(field, field)] * 4)
    value_series.extend(['Known', 'Supressed', 'Unknown', 'Missing'])
    percent_series.extend([unknowns.loc['Known', field],
                           unknowns.loc['NA', field],
                           unknowns.loc['Unknown', field],
                           unknowns.loc['Missing', field]])
  test = pd.DataFrame.from_dict({'field': field_series,
                               'value': value_series,
                               'percent': percent_series})
  return alt.Chart(test).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='%'), title=''),
      y=alt.Y('field', sort='x', title='Field'),
      color=alt.Color('value', scale=alt.Scale(scheme='category20'), title='Value'),
      order=alt.Order('field:N'),
      tooltip=[
                  alt.Tooltip('field:N', title='Field'),
                  alt.Tooltip('value:N', title='Value'),
                  alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
      ]
  )
In [3]:
#@title
CASES = 'Cases'
DATASET = 'cdc'
metric = CASES

project_id = 'msm-secure-data-1b'
cdc_table = '`%s.ndunlap_secure.cdc_restricted_access_20201231`' % project_id
date = 'DATE(2020, 12, 16)'
date_int = '20201216'
date_display_name = 'Dec 16'

# Chart settings.
total_cases_scale_max = 40000
scatter_height = 300
scatter_width = 300
map_height = 300
map_width = 450
us_states = alt.topo_feature(data.us_10m.url, 'states')
us_counties = alt.topo_feature(data.us_10m.url+"#", 'counties')

territories = ('PR', 'GU', 'VI', 'MP', 'AS')
race_ethnicity_combined_map = {
    'Asian, Non-Hispanic': 'asian_cases',
    'Black, Non-Hispanic': 'black_cases',
    'White, Non-Hispanic': 'white_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
    'Hispanic/Latino': 'hispanic_cases',
    'Multiple/Other, Non-Hispanic': 'other_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
    'Missing': 'unknown_cases',
    'Unknown': 'unknown_cases',
    'NA': 'na_cases',
}

states_to_fips = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12, 'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NYC': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56, 'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78}
states_to_abbreviations = {'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'}
In [4]:
#@title
crdt_query = ('''
SELECT
  State as state,
  Deaths_Total as crdt_cases,
  Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
  ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM `msm-secure-data-1b.ndunlap_secure.crdt_20210203`
WHERE
  date = %s
''' % date_int)

nyt_states_query = ('''
SELECT
  state_name,
  state_fips_code,
  deaths as nyt_cases,
  deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
  date = %s AND
  state_fips_code IS NOT NULL
''' % date)

nyt_counties_query = ('''
SELECT
  county_fips_code,
  deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = %s AND
  county_fips_code IS NOT NULL
''' % date)

cdc_states_query = ('''
SELECT
  res_state,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_state
''' % cdc_table)

cdc_counties_query = ('''
SELECT
  res_state,
  res_county,
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_county,
   res_state,
   race_ethnicity_combined
''' % cdc_table)

compare_cases_unknowns_query = ('''
SELECT
  res_state,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_state,
   race_ethnicity_combined
''' % cdc_table)

cdc_states_by_month_query = ('''
SELECT
  res_state,
  CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
  COUNT(*) as total_cases,
FROM
  %s
WHERE
  death_yn = 'Yes' AND
  cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
  cdc_case_earliest_dt < DATE(2020, 12, 1) AND
  res_state in ('AK', 'CA', 'CT', 'DE', 'GA', 'LA', 'MD', 'ND', 'NY', 'PA', 'RI')
GROUP BY
   1, 2
ORDER BY
   1, 2
''' % cdc_table)

cdc_states_by_month_known_or_na_query = ('''
SELECT
  res_state,
  CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
  COUNT(*) as known_or_na_cases,
FROM
  %s
WHERE
  death_yn = 'Yes' AND
  cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
  cdc_case_earliest_dt < DATE(2020, 12, 1) AND
  race_ethnicity_combined != 'Unknown' AND
  race_ethnicity_combined != 'Missing'
GROUP BY
   1, 2
ORDER BY
   1, 2
''' % cdc_table)

cdc_overall_query = ('''
SELECT
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   1
''' % cdc_table)

Background

The CDC Case Surveillance data about who died from COVID-19 has issues similar to those discussed in the case data completeness analysis.

  • Only 80% of total deaths up to Dec 16 are included (60K deaths are missing)
  • Of those deaths, 78% have known race/ethnicity (52K out of 239K cases are missing race/ethnicity)

For the 187M cases where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:

In [5]:
#@title
overall_df = pd.io.gbq.read_gbq(cdc_overall_query, project_id=project_id)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
overall_df = overall_df.set_index('race_ethnicity_combined')

chart_denominator = 1000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
         overall_df.cases['black_cases'] / chart_denominator,
         overall_df.cases['white_cases'] / chart_denominator,
         overall_df.cases['asian_cases'] / chart_denominator,
         overall_df.cases['nhpi_cases'] / chart_denominator,
         overall_df.cases['aian_cases'] / chart_denominator,
         overall_df.cases.sum() / chart_denominator,
]

# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
    60481746 / chart_denominator,
    40596040  / chart_denominator,
    196789401 / chart_denominator,
    18427914  / chart_denominator,
    565473 / chart_denominator,
    2236348 / chart_denominator,
    328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
  percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
    'Hispanic/Latino',
    'Black',
    'White',
    'Asian',
    'Native Hawaiian/Pacific Islander',
    'American Indian/Alaska Native',
    '*Total Including Unknowns*',
], 'percent': percent_list,
   'cases': cases_list,
   'population': pop_list,
})

bars = alt.Chart(prevalence).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='.2%'), title=''),
      y=alt.Y('group', sort='-x', title=''),
      color=alt.Color('group', 
                      scale=alt.Scale(scheme='tableau20'),
                      title='',
                      legend=None),
      tooltip=[
                  alt.Tooltip('group:N', title='Race/Ethnicity Group'),
                  alt.Tooltip('percent:Q', format='.2%', title='Percent who died'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
                  alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
      ]
).properties(
   title='Percent of Race/Ethnicity Group who died from COVID-19 based on Incomplete CDC Data as of %s' % 'Dec 16'
)

bars.display()

#alt.concat(bars).properties(
#    title=alt.TitleParams(
#        ['Source: U.S. Census Bureau\'s American Community Survey 2019 5-year estimates for population data.'],
#        baseline='bottom',
#        dy=20,
#        orient='bottom',
#        fontWeight='normal',
#        fontSize=11
#    )
#).display()

But the chart above is based on incomplete data. For example, the CDC data says that 0% of deaths in California were Hispanic/Latino people, whereas the California public health website reports that Hispanics/Latinos made up 46% of deaths (17K people) as of Jan 27. If we added 15K Hispanic/Latino deaths from California, the total percentage of Hispanic/Latinos who died from COVID-19 would increase from 0.04% to 0.07%.

If all 54K deaths with missing race/ethnicity were Hispanic/Latino people, the percent of Hispanic/Latinos in the U.S. who died from COVID-19 would go from 0.04% to 0.13% — a 3x increase. If all 54K deaths with missing race/ethnicity were Black people, the percent of Black people who died from COVID-19 would go from 0.08% to 0.20% — a 2.5x increase. While both of these scenarios are unlikely, they show why missing race/ethnicity data is preventing us from fully understanding and addressing the disparities in the COVID-19 pandemic in the U.S.

Overview

The goal of this analysis is to assess the completeness of the CDC's Restricted Access data and its feasibility in examining disparities in race/ethnicity for COVID-19 deaths at the state and county levels. For a case data analysis with more background information, see this report.

The top-level data completeness findings are:

  1. Data Overview: The field indicating if the person died or not is only known for 48% of cases. Race/ethnicity was known for 78% of cases where the person died, as opposed to 98%-100% for all the other fields below.
In [6]:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
project_id = 'msm-secure-data-1b'
table = '`msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20201231`'
FieldAnalysis(project_id, table, field_list).display()
  1. Total Case Counts: The CDC data contains 80% of the total cases reported in more reliable data sources on the same data. There's high variability at the state level with 10 states reporting fewer than 10% of cases and several reporting 0%.
  2. Cases with Race/Ethnicity: Race/ethnicity data is available for 78% of cases in the CDC data compared to 93% in the Covid Racial Data Tracker. The Covid Racial Tracker has less variability with all but one state reporting at least 70% of deaths with race/ethnicity.

We will also evaluate a better source for deaths data that comes from death certificates, as reported in the CDC Provisional Deaths State data, the CDC Provisional Deaths County data, and on this CDC dashboard. The CDC Provisional Deaths data is more complete within each county it reports than the case data, but the Provisional Deaths County data only contains data for counties with 100 or more deaths, which was 644 counties on Feb 3 vs. 2,328 counties in the CDC case data on Dec 16. The provisional deaths data is an excellent alternative to the case data at the state level and a good alternative at the county level if you are only interested in counties with high populations instead of a comprehensive view of deaths that includes areas with smaller populations.

What we didn't include in this report:

Completeness Analysis

Total Death Counts

Baseline: NYT vs. CRDT

To get a baseline of how much we could expect the CDC death counts to match the CRDT or NYT, we can see how closely the CRDT and NYT match each other. Each dot below is a state (hover to see details), and the black line shows where the NYT and CRDT death counts are equal.

In [7]:
#@title
def CreateScatterPlot(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
    axis_format = ',.0f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'
    axis_format = '.0%'

  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))
  plot = alt.Chart(chart_df).mark_circle(size=60).encode(
      alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      color=alt.Color(fields_dict['percent']['name'],
                      type='quantitative',
                      scale=alt.Scale(scheme=scale_scheme,
                                      reverse=scale_reverse,
                                      domain=scale_domain,
                                      clamp=True),
                      legend=alt.Legend(format=legend_format),
                      title=metric_type.capitalize()),
      tooltip=tooltips,
  ).properties(
      height=height,
      width=width,
  )
  if metric_type == 'ratio':
    plot.interactive()

  line = pd.DataFrame({
      'x': [0, scale_max],
      'y': [0, scale_max],
  })

  if metric_type == 'ratio':
    line_plot = alt.Chart(line).mark_line(color='black').encode(
        x='x',
        y='y',
    )
  elif metric_type == 'percent':
    line_plot = (
        alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
        alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
    )
  # Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
  scatter = (plot + line_plot).properties(
      title=title,
      height=height,
      width=width,
  ).interactive()
  return scatter

def CreateMap(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  fips_code = 'state_fips_code'
  topo_feature = us_states
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'
    fips_code = 'county_fips'
    topo_feature = us_counties

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'

  highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))

  field_names = [geo_field]
  field_names.extend([fields_dict[field]['name'] for field in fields_dict])
  plot = alt.Chart(topo_feature).mark_geoshape(
        stroke='white',
        strokeOpacity=.2,
        strokeWidth=1
    ).project(
      type='albersUsa'
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(chart_df, fips_code, field_names)
    ).encode(
        alt.Color(fields_dict['percent']['name'],
                  type='quantitative',  
                  legend=alt.Legend(format=legend_format),
                  scale=alt.Scale(scheme=scale_scheme,
                                  reverse=scale_reverse,
                                  domain=scale_domain,
                                  clamp=True,
                                  ),
                  title=metric_type.capitalize()),
         tooltip=tooltips
    ).add_selection(
        highlight,
    )

  states_outline = alt.Chart(us_states).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
        type='albersUsa'
  )

  states_fill = alt.Chart(us_states).mark_geoshape(
        fill='silver',
        stroke='white'
  ).project('albersUsa')

  layered_map = alt.layer(states_fill, plot, states_outline).properties(
        height=height,
        width=width,
        title=title,
  )
  return layered_map

def CreateScatterPlotAndMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
  scatter = CreateScatterPlot(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
  map = CreateMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
  return (scatter | map).configure_view(
       strokeWidth=0,
   ).configure_mark(
       stroke='grey'
   ).configure_legend(
       gradientLength=scatter_height - 50
   )

def PrintSummaryStats(chart_df, field='percent'):
  below_15 = len(chart_df[chart_df[field] < .85]) / len(chart_df)
  above_15 = len(chart_df[chart_df[field] > 1.15]) / len(chart_df)
  print('between +/-15%: ', round(1 - below_15 - above_15, 2))
  below_50 = len(chart_df[chart_df[field] < .5]) / len(chart_df)
  above_50 = len(chart_df[chart_df[field] > 1.5]) / len(chart_df)
  print('between +/-50%: ', round(1 - below_50 - above_50, 2))
  print('< than .50: ', len(chart_df[chart_df[field] < .5]))
  print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
  print(chart_df[field].describe())
In [8]:
#@title
crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)
crdt_df.set_index('state', inplace=True)

nyt_states_df = pd.io.gbq.read_gbq(nyt_states_query, project_id=project_id)
nyt_states_df.state_fips_code.unique()
nyt_territories = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
for territory in nyt_territories:
  nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
nyt_states_df.set_index('state_fips_code', inplace=True)

crdt_df.reset_index(inplace=True)
crdt_df['state_fips_code'] = crdt_df.state
crdt_df = crdt_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_df.set_index('state_fips_code', inplace=True)
nyt_crdt_merged_df = nyt_states_df.join(crdt_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')

nyt_crdt_merged_df['percent'] = round(nyt_crdt_merged_df.nyt_cases / nyt_crdt_merged_df.crdt_cases, 2)
nyt_crdt_merged_df
nyt_crdt_merged_df.reset_index(inplace=True)


below_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent < .85]) / len(nyt_crdt_merged_df)
above_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent > 1.15]) / len(nyt_crdt_merged_df)
#print('between +/-15%: ', round(1 - below_15 - above_15, 2))
#nyt_crdt_merged_df.percent.describe()
In [9]:
#@title
nyt_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of NYT to CRDT'},
}
nyt_crdt_title = 'Ratio of NYT to CRDT Deaths by State as of %s' % date_display_name

CreateScatterPlotAndMap(
    nyt_crdt_merged_df, nyt_crdt_fields_dict, nyt_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()

States: CDC vs. CRDT

We can see below that the CDC death counts differ from the CRDT death counts much more drastically than the NYT did.

In [10]:
#@title
cdc_states_df = pd.io.gbq.read_gbq(cdc_states_query, project_id=project_id)
cdc_states_df.rename(columns={'res_state': 'state'}, inplace=True)
cdc_states_df.set_index('state', inplace=True)

crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)

for territory in territories:
  crdt_df = crdt_df[crdt_df.state != territory]

crdt_df.set_index('state', inplace=True)
cdc_crdt_merged_df = cdc_states_df.join(crdt_df, on="state", how='right', lsuffix='_left', rsuffix='_right')
cdc_crdt_merged_df.reset_index(inplace=True)
cdc_crdt_merged_df['state_fips_code'] = cdc_crdt_merged_df.state
cdc_crdt_merged_df = cdc_crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
cdc_crdt_merged_df['percent'] = round(cdc_crdt_merged_df.cdc_cases / cdc_crdt_merged_df.crdt_cases, 4)

# PrintSummaryStats(cdc_crdt_merged_df)
In [11]:
#@title
# CDC vs. NYT county

df = pd.io.gbq.read_gbq(cdc_counties_query, project_id=project_id)
for territory in territories:
  df = df[df.res_state != territory]

project_id = 'msm-secure-data-1b'
df_county_fips_map = pd.io.gbq.read_gbq(f'''
SELECT
*
FROM
  `msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''', project_id=project_id)

df_county_fips_map.cdc_county = df_county_fips_map.cdc_county.str.lower()
df_county_fips_map['state_county'] = df_county_fips_map.state + '-' + df_county_fips_map.cdc_county
df_county_fips_map['state_county'] = df_county_fips_map.state_county.astype('string').str.strip()
df_county_fips_map.set_index('state_county', inplace=True)
In [12]:
#@title
# Concatenate the state and county names because county names are not unique across states.
df.res_county = df.res_county.str.lower()
df['state_county'] = df.res_state + '-' + df.res_county
df['state_county'] = df.state_county.astype('string').str.strip()
df.set_index('state_county', inplace=True)
df['race_ethnicity_combined'] = df.race_ethnicity_combined.astype('string').str.strip()

df = df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
In [13]:
#@title
mismatches_df = df.join(df_county_fips_map, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
#print(mismatches_df.cases.sum())
#print('vs. 60363 with NULL county_fips_code')
# SELECT 
#count(*) as total_cases,
#FROM `msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20201231`
#WHERE county_fips_code IS NULL
In [14]:
#@title
merged_df = df.join(df_county_fips_map, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')

# Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
crosstab_df = pd.crosstab(merged_df['county_fips'], merged_df.race_ethnicity_combined, values=merged_df.cases, aggfunc=sum,
                          margins=True,
                          margins_name='total_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['county_fips'] = crosstab_df.county_fips.astype(int)
crosstab_df['total_known_cases'] = crosstab_df['total_cases'] - crosstab_df.na_cases.fillna(0) - crosstab_df.unknown_cases.fillna(0)
In [15]:
#@title
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
df_acs_name_lookup = pd.io.gbq.read_gbq(f'''
SELECT
  state,
  county,
  county_fips,
  total_pop
FROM
  `msm-internal-data.ipums_acs.acs_2019_5year_county`
''', project_id=project_id)

df_acs_name_lookup['state_county'] = df_acs_name_lookup.county.astype('string').str.strip() + ', ' + df_acs_name_lookup.state.astype('string').str.strip()
df_acs_name_lookup.drop(columns=['state', 'county'], inplace=True)
df_acs_name_lookup.set_index('county_fips', inplace=True)

county_chart_df = crosstab_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_df.county_fips = county_chart_df.county_fips.astype(int)

#print(len(county_chart_df))
#print(county_chart_df.total_pop.sum())
#print(county_chart_df.total_pop.sum() / 324697795)  # Population covered in these counties
#print(0.55 * 324697795) # NYT population
In [16]:
#@title

nyt_counties_df = pd.io.gbq.read_gbq(nyt_counties_query, project_id=project_id)
nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_df.county_fips.unique()
nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
nyt_counties_df.set_index('county_fips', inplace=True)

county_chart_df.set_index('county_fips', inplace=True)
nyt_merged_df = county_chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_merged_df = nyt_merged_df.reset_index()
nyt_merged_df['percent'] = round(nyt_merged_df.total_cases / nyt_merged_df.nyt_cases, 2)

#PrintSummaryStats(nyt_merged_df)
In [17]:
#@title
cdc_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_title = 'Ratio of CDC to CRDT Deaths by State as of %s' % date_display_name

CreateScatterPlotAndMap(
    cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()

Counties: CDC vs. NYT

We can do the same analysis at the county level using the CDC vs. NYT data.

In [18]:
#@title
cdc_nyt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC Deaths to NYT Deaths by County as of Dec 16'
zoom_cdc_nyt_title = 'Zoom in on counties with up to 3,000 Deaths'

scatter = CreateScatterPlot(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, 10000, scatter_height, scatter_width, 'county', 'ratio'
)
zoom_scatter = CreateScatterPlot(
    nyt_merged_df, cdc_nyt_fields_dict, zoom_cdc_nyt_title, 3000, scatter_height, scatter_width, 'county', 'ratio'
)

(scatter | zoom_scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [19]:
#@title
cdc_nyt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC Deaths to NYT Deaths by County as of Dec 16'

cdc_nyt_map = CreateMap(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height, map_width, 'county', 'ratio'
)
cdc_crdt_map = CreateMap(
    cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, map_height, map_width, 'state', 'ratio'
)

(cdc_crdt_map | cdc_nyt_map).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

Notes:

  • The legend only goes to 2.0, and all counties with a larger ratio are shown in the same dark blue color.
  • States can have cases but no counties with cases because county names can be suppressed for privacy reasons.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

Deaths with Race/Ethnicity

States and Counties: CDC

In [20]:
#@title
states_df = pd.io.gbq.read_gbq(compare_cases_unknowns_query, project_id=project_id)
for state in ('Unknown', 'NA', 'OCONUS'):
  states_df = states_df[states_df.res_state != state]

states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
states_df = states_df.replace(to_replace={'race_ethnicity_combined': {
    'Asian, Non-Hispanic': 'cdc_known_cases',
    'Black, Non-Hispanic': 'cdc_known_cases',
    'White, Non-Hispanic': 'cdc_known_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
    'Hispanic/Latino': 'cdc_known_cases',
    'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
    'Missing': 'cdc_unknown_cases',
    'Unknown': 'cdc_unknown_cases',
    'NA': 'cdc_na_cases',
    }})
states_df.rename(columns={'res_state': 'state'}, inplace=True)
In [21]:
#@title
crosstab_df = pd.crosstab(states_df['state'], states_df.race_ethnicity_combined, values=states_df.cdc_cases, aggfunc=sum,
                          margins=True,
                          margins_name='cdc_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['cdc_known_or_na_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df['cdc_known_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df

crdt_merged_df = crosstab_df.join(crdt_df, on="state", how='inner', lsuffix='_left', rsuffix='_right')
crdt_merged_df.reset_index(inplace=True)
crdt_merged_df['state_fips_code'] = crdt_merged_df.state
crdt_merged_df = crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_merged_df['cdc_known_cases_percent'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['cdc_known_or_na_cases_percent'] = round(crdt_merged_df.cdc_known_or_na_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['percent'] = round(crdt_merged_df.cdc_cases / crdt_merged_df.crdt_cases, 4)
crdt_merged_df['percent_known_cases'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.crdt_known_race_cases, 4)

crdt_merged_df_no_ny = crdt_merged_df[crdt_merged_df.state != 'NY']
#PrintSummaryStats(crdt_merged_df_no_ny)

When evaluating the percent of deaths that report on race/ethnicity in the CDC data, we also need to consider the 3% of overall deaths with race/ethnicity that were suppressed due to privacy reasons. We should give states and counties credit for reporting race/ethnicity data for those deaths even if we aren't able to use it due to privacy suppression. Below, the maps on the top left shows the percent of deaths with known race/ethnicity and the map on the top right shows the percent of deaths with known or suppressed race/ethnicity. The maps on the bottom show the same information at the county level.

In [22]:
#@title

chart_df = county_chart_df.copy(deep=True)
chart_df.reset_index(inplace=True)
chart_df.county_fips = chart_df.county_fips.astype(int)
chart_df['percent_known_cases'] = round(chart_df.total_known_cases / chart_df.total_cases, 2)
chart_df['total_known_or_na_cases'] = chart_df.total_known_cases.fillna(0) + chart_df.na_cases.fillna(0)
chart_df['percent_known_or_na_cases'] = round(chart_df.total_known_or_na_cases / chart_df.total_cases, 2)
In [23]:
#@title
cdc_known_state_fields_dict = {
    'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

cdc_known_state_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_state_map = CreateMap(
    crdt_merged_df, cdc_known_state_fields_dict, cdc_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

cdc_known_or_na_state_fields_dict = {
    'x': {'name': 'cdc_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_or_na_cases_percent', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_state_title = 'CDC Deaths with Known+Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_state_map = CreateMap(
    crdt_merged_df, cdc_known_or_na_state_fields_dict, cdc_known_or_na_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(cdc_known_state_map | cdc_known_or_na_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [24]:
#@title
cdc_known_county_fields_dict = {
    'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_county_map = CreateMap(
    chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

cdc_known_or_na_county_fields_dict = {
    'x': {'name': 'total_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_or_na_cases', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_county_title = 'CDC Deaths with Known+Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_county_map = CreateMap(
    chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

(cdc_known_county_map | cdc_known_or_na_county_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [25]:
#@title
#PrintSummaryStats(crdt_merged_df, field='cdc_known_cases_percent')
#PrintSummaryStats(crdt_merged_df, field='cdc_known_or_na_cases_percent')
#tuple(crdt_merged_df[crdt_merged_df.cdc_known_or_na_cases_percent <= .5].state)

Note: A larger version of the county maps for hovering over smaller counties is available in the Appendix.

States: CDC vs. CRDT

How does the CDC data compare to the CRDT data, which is the most up-to-date aggregate data we have for race/ethnicity at the state level? Overall, 93% of the deaths in the CRDT data have known race/ethnicity compared to 78% in the CDC data (81% with suppressed data).

We may even be undercounting the 93% of deaths with known race/ethnicity in the CRDT data because of the non-standard ways that each state reports on race/ethnicity, as described in this Covid Racial Data Tracker analysis. If a state uses a combined race/ethnicity field, then it's a straightforward comparison to the CDC's combined race/ethnicity field. If a state uses separate fields for race/ethnicity, then we still use the number of people with known race within each state because all of the race categories will also contain Hispanic/Latino people. We could potentially be undercounting the number of people with known race/ethnicity in the CRDT if there are people who have unknown race but known ethnicity. If we adjusted the numbers in those cases, it would make the CRDT percentages look even better in comparison to the CDC data.

In [26]:
#@title
crdt_known_state_fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

crdt_known_state_title = 'CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name
crdt_known_map = CreateMap(
    cdc_crdt_merged_df, crdt_known_state_fields_dict, crdt_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(crdt_known_map | cdc_known_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [27]:
#@title
fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT known race/ethnicity deaths'},
    'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC known race/ethnicity deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
title = 'Ratio of CDC to CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name

CreateScatterPlotAndMap(
    crdt_merged_df, fields_dict, title, 40000, scatter_height, scatter_width, map_width - 5, 'state', 'ratio'
).display()
In [28]:
#@title
#print(crdt_merged_df.crdt_known_race_cases.sum() / crdt_merged_df.crdt_cases.sum())
#PrintSummaryStats(cdc_crdt_merged_df, field='crdt_known_race_cases_percent')

How to Improve State and County Data

How can states and counties improve their completeness for race/ethnicity data, especially when compared to the more reliable and up-to-date aggregate data that comes from public health websites, as collected by the CRDT and NYT?

There are two ways in which states can improve the data they send to the CDC:

  1. Increase the total deaths reported to get closer to the aggregate data.
  2. Increase the percentage of deaths reported with known race/ethnicity to get closer to 100%.

In the Total Death Counts section above, we identified the states and counties with the biggest discrepancies relative to aggregate data. In the Deaths with Race/Ethnicity section, we looked at the percentage of deaths within each state and county that have race/ethnicity data.

The charts below show those two components together; the scatterplots show (1) the discrepancy vs. CRDT/NYT total death counts on the y-axis, and (2) the percentage of deaths with known or suppressed race/ethnicity on the x-axis. The colors of the dots and on the map show the product of those two numbers, which is the percentage of total deaths in the CRDT/NYT that the CDC data has with known or suppressed race/ethnicity.

The scatterplots below can help us diagnose the issues in each state or county:

  • Bottom left quadrant: Low percentage of deaths reported, low reporting of race/ethnicity.
  • Top left quadrant: Mid-to-high percentage of deaths reported, low reporting of race/ethnicity.
  • Bottom right quadrant: Low percentage of deaths reported, mid-to-high reporting of race/ethnicity.
  • Top right quadrant: Mid-to-high percentage of deaths reported, mid-to-high reporting of race/ethnicity.
In [29]:
#@title
nyt_cdc_known_merged_df = chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_cdc_known_merged_df.reset_index(inplace=True)
nyt_cdc_known_merged_df['percent'] = round(nyt_cdc_known_merged_df.total_cases / nyt_cdc_known_merged_df.nyt_cases, 2)
In [66]:
#@title
crdt_merged_df['percent_max_100'] = crdt_merged_df.percent.clip(upper=1)
crdt_merged_df['percent_reccs'] = crdt_merged_df.percent_max_100 * crdt_merged_df.cdc_known_or_na_cases_percent
state_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of CRDT total deaths'},
    'x': {'name': 'cdc_known_or_na_cases_percent', 'format': '.0%', 'title': 'CDC percent with known or suppressed race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of CRDT total with race/ethnicity'},
}
state_reccs_title = 'CDC Percent of Total Deaths x Race/Ethnicity as of %s' % date_display_name

scatter = CreateScatterPlotAndMap(
    crdt_merged_df, state_reccs_fields_dict, state_reccs_title, 1, scatter_height, scatter_width, map_width, 'state', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [67]:
#@title
nyt_cdc_known_merged_df['percent_max_100'] = nyt_cdc_known_merged_df.percent.clip(upper=1)
nyt_cdc_known_merged_df['percent_reccs'] = nyt_cdc_known_merged_df.percent_max_100 * nyt_cdc_known_merged_df.percent_known_or_na_cases
county_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
    'x': {'name': 'percent_known_or_na_cases', 'format': '.0%', 'title': 'CDC percent with known or suppressed race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_title = state_reccs_title = 'CDC Percent of Total Deaths x Race/Ethnicity as of %s' % date_display_name


scatter = CreateScatterPlotAndMap(
    nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Notes:

  • All states or counties with > 100% of the total deaths in the CRDT or NYT data were capped at 100%.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.
In [32]:
#@title
cdc_states_by_month_df = pd.io.gbq.read_gbq(cdc_states_by_month_query, project_id=project_id)
cdc_states_by_month_df.set_index(keys=['res_state', 'date'], inplace=True)

cdc_states_by_month_known_or_na_df = pd.io.gbq.read_gbq(cdc_states_by_month_known_or_na_query, project_id=project_id)
cdc_states_by_month_known_or_na_df.set_index(keys=['res_state', 'date'], inplace=True)

cdc_known_over_time = cdc_states_by_month_df.join(cdc_states_by_month_known_or_na_df, how='left')
cdc_known_over_time['percent_known_or_na'] = round(cdc_known_over_time.known_or_na_cases / cdc_known_over_time.total_cases, 2)
cdc_known_over_time.reset_index(inplace=True)
In [33]:
#@title
base = alt.Chart(cdc_known_over_time).mark_line(point=True).encode(
    x=alt.X('date', title='CDC earliest report date', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('percent_known_or_na', title='Percent unknown or suppressed race/ethnicity', axis=alt.Axis(format='%')),
    color=alt.Color('res_state', scale=alt.Scale(scheme='category20'), title='State')
).properties(
    title='States with less than 50% of Cumulative Cases with Known or Suppressed Race/Ethnicity',
    height=map_height,
    width=map_width
)
#base.display()

Better Data from Death Certificates

The CDC has an alternative public source for death data with race/ethnicity at the state and county levels that comes from death certificates. This data is more complete than the case surveillance data in every way except that the county-level data only contains about one fourth of the counties in the CDC case data. The reason for this is that the CDC Provisional Deaths County data only includes counties with 100 or more deaths. The CDC Provisional Deaths State data also has a limit of 100 or more deaths per state, but all states had reached that threshold by Feb 3, 2021.

The table below summarizes the U.S.-level completeness of the CDC Case data vs. the CDC Provisional Deaths data for counties and states.

In [80]:
#@title
# Manually update these fields based on the latest CDC data.
row_names = [
    'Update frequency',
    'Data as of date',
    'Deaths in data as of date',
    'Deaths in CTP as of date',
    '(as a % of CTP)',
    'Number of counties',
    '(as a % of all counties)',
    'U.S. population in those counties',
    '(as a % of total U.S population – States + D.C.)',
    'Deaths with known race/ethnicity',
    '(as a % of deaths in data)',
    'Access',
    'Limitations'
]
cdc_deaths_county_metadata = [
    'Weekly',
    'Feb 3, 2021',
    '368K',
    '441K',
    '(84%)',
    '644',
    '(20%)',
    '253K',
    '(78%)',
    '362K',
    '(98%)',
    'Public',
    'Cumulative, counties >= 100 deaths'
]
cdc_deaths_state_metadata = [
    'Weekly',
    'Feb 3, 2021',
    '421K',
    '441K',
    '(96%)',
    'N/A',
    '(N/A)',
    'N/A',
    '(N/A)',
    '418K',
    '(99%)',
    'Public',
    'Cumulative'
]
cdc_cases_metadata = [
    'Monthly', 
    'Dec 16, 2020',
    '239K',
    '299K',
    '(80%)',
    '2,328',
    '(74%)',
    '278M',
    '(86%)',
    '187K',
    '(78%)',
    'Restricted',
    'Data completeness issues'
]
table_data = {'CDC Case Data': cdc_cases_metadata,
              'CDC County Provisional Deaths': cdc_deaths_county_metadata,
              'CDC State Provisional Deaths ': cdc_deaths_state_metadata,
}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
Out[80]:
CDC Case Data CDC County Provisional Deaths CDC State Provisional Deaths
Update frequency Monthly Weekly Weekly
Data as of date Dec 16, 2020 Feb 3, 2021 Feb 3, 2021
Deaths in data as of date 239K 368K 421K
Deaths in CTP as of date 299K 441K 441K
(as a % of CTP) (80%) (84%) (96%)
Number of counties 2,328 644 N/A
(as a % of all counties) (74%) (20%) (N/A)
U.S. population in those counties 278M 253K N/A
(as a % of total U.S population – States + D.C.) (86%) (78%) (N/A)
Deaths with known race/ethnicity 187K 362K 418K
(as a % of deaths in data) (78%) (98%) (99%)
Access Restricted Public Public
Limitations Data completeness issues Cumulative, counties >= 100 deaths Cumulative

The CDC Provisional data is more complete at the U.S. level on all measures except for the number of counties contained in the county data. APM Research Lab found that the CDC State Provisional Death data was actually more complete than public health website data for 12 states.

Total Death Counts

In [35]:
#@title
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
provisional_date = 'DATE(2021, 2, 3)'
provisional_date_int = 20210203
provisional_date_display_name = 'Feb 3'

acs_name_lookup_query = ('''
SELECT
  state,
  county,
  county_fips,
  total_pop
FROM
  `msm-internal-data.ipums_acs.acs_2019_5year_county`
''')

cdc_provisional_deaths_county_query = ('''
SELECT
  *
FROM
  `msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_county_20210203`
WHERE
  Indicator = "Distribution of COVID-19 deaths (%)"
''')

cdc_provisional_deaths_state_query = ('''
SELECT
  *
FROM
  `msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_state_20210203`
WHERE
  Indicator = "Distribution of COVID-19 deaths (%)" AND
  State != 'United States'
''')

cdc_provisional_deaths_state_counts_query = ('''
SELECT
  *
FROM
  `msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_state_20210203`
WHERE
  Indicator = "Count of COVID-19 deaths" AND
  State != 'United States'
''')

crdt_provisional_query = ('''
SELECT
  State as state,
  Deaths_Total as crdt_cases,
  Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
  ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM `msm-secure-data-1b.ndunlap_secure.crdt_20210203`
WHERE
  date = %s
''' % provisional_date_int)

nyt_counties_provisional_query = ('''
SELECT
  county_fips_code,
  deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = %s  AND
  county_fips_code IS NOT NULL
''' % provisional_date)

cdc_provisional_deaths_state_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_query, project_id=project_id)
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.replace(
    to_replace={'State': states_to_abbreviations})
cdc_provisional_deaths_state_counts_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_state_counts_query, project_id=project_id)
cdc_provisional_deaths_state_counts_df = cdc_provisional_deaths_state_counts_df.replace(
    to_replace={'State': states_to_abbreviations})
cdc_provisional_deaths_state_df['cdc_known_cases_percent'] = round((
    cdc_provisional_deaths_state_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_state_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_state_df.Hispanic_or_Latino.fillna(0)) / 100, 4)
cdc_provisional_deaths_state_counts_df['cdc_known_cases'] = round(
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_Black_or_African_American.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_state_counts_df.Hispanic_or_Latino.fillna(0))
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.set_index('State')
cdc_provisional_deaths_state_counts_df = cdc_provisional_deaths_state_counts_df.set_index('State')
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df.join(
    cdc_provisional_deaths_state_counts_df, on="State", how='inner', lsuffix='_left', rsuffix='_right')
# Extrapolate from the % known race/ethnicity cases and their counts to the total case count for the state.
cdc_provisional_deaths_state_df['cdc_cases'] = round(
    cdc_provisional_deaths_state_df.cdc_known_cases / cdc_provisional_deaths_state_df.cdc_known_cases_percent, 0)
# Comnbine the case counts for NY and New York City, recalculate percentage, remove New York City. 
cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'] = (
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'] +
    cdc_provisional_deaths_state_df.loc['New York City', 'cdc_cases'])
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] = (
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] +
    cdc_provisional_deaths_state_df.loc['New York City', 'cdc_known_cases'])
cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases_percent'] = round(
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_known_cases'] /
    cdc_provisional_deaths_state_df.loc['NY', 'cdc_cases'], 4)
cdc_provisional_deaths_state_df.drop(['New York City'], inplace=True)
cdc_provisional_deaths_state_df.reset_index(inplace=True)
cdc_provisional_deaths_state_df.rename(columns={'State': 'state'}, inplace=True)
cdc_provisional_deaths_state_df = cdc_provisional_deaths_state_df[
    ['state', 'cdc_cases', 'cdc_known_cases', 'cdc_known_cases_percent']].copy()

crdt_provisional_df = pd.io.gbq.read_gbq(crdt_provisional_query, project_id=project_id)
crdt_provisional_df.set_index('state', inplace=True)
cdc_provisional_crdt_merged_df = cdc_provisional_deaths_state_df.join(crdt_provisional_df, on="state", how='right', lsuffix='_left', rsuffix='_right')
cdc_provisional_crdt_merged_df.reset_index(inplace=True)
cdc_provisional_crdt_merged_df['state_fips_code'] = cdc_provisional_crdt_merged_df.state
cdc_provisional_crdt_merged_df = cdc_provisional_crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
cdc_provisional_crdt_merged_df['percent'] = round(cdc_provisional_crdt_merged_df.cdc_cases / cdc_provisional_crdt_merged_df.crdt_cases, 4)
cdc_provisional_crdt_merged_df['percent_known_cases'] = round(cdc_provisional_crdt_merged_df.cdc_known_cases / cdc_provisional_crdt_merged_df.crdt_known_race_cases, 4)

# Counties data

df_acs_name_lookup = pd.io.gbq.read_gbq(acs_name_lookup_query, project_id=project_id)
df_acs_name_lookup.set_index('county_fips', inplace=True)

cdc_provisional_deaths_county_df = pd.io.gbq.read_gbq(cdc_provisional_deaths_county_query, project_id=project_id)

cdc_provisional_deaths_county_df['county_fips'] = cdc_provisional_deaths_county_df.FIPS_Code
cdc_provisional_deaths_county_df.set_index('county_fips', inplace=True)
cdc_provisional_deaths_county_df['state_county'] = cdc_provisional_deaths_county_df.County_Name + ', ' + cdc_provisional_deaths_county_df.State
cdc_provisional_deaths_county_df['total_known_cases'] = round((
    cdc_provisional_deaths_county_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_Black.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_county_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_county_df.Hispanic.fillna(0)) * cdc_provisional_deaths_county_df.COVID_19_Deaths, 0)

county_chart_provisional_df = cdc_provisional_deaths_county_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_provisional_df.reset_index(inplace=True)
county_chart_provisional_df.county_fips = county_chart_provisional_df.county_fips.astype(int)

nyt_counties_provisional_df = pd.io.gbq.read_gbq(nyt_counties_provisional_query, project_id=project_id)
nyt_counties_provisional_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_provisional_df.county_fips.unique()
nyt_counties_provisional_df['county_fips'] = nyt_counties_provisional_df.county_fips.astype(int)
nyt_counties_provisional_df.set_index('county_fips', inplace=True)

county_chart_provisional_df.set_index('county_fips', inplace=True)
nyt_merged_provisional_df = county_chart_provisional_df.join(nyt_counties_provisional_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
nyt_merged_provisional_df = nyt_merged_provisional_df.reset_index()
nyt_merged_provisional_df['percent'] = round(nyt_merged_provisional_df.COVID_19_Deaths / nyt_merged_provisional_df.nyt_cases, 4)
nyt_merged_provisional_df['percent_known_cases'] = round(nyt_merged_provisional_df.total_known_cases / nyt_merged_provisional_df.COVID_19_Deaths, 4)

#print('total deaths: ', nyt_merged_provisional_df.COVID_19_Deaths.sum())
#print('total counties: ', len(county_chart_provisional_df))
#print('as % of counties: ', len(county_chart_provisional_df) / 3143)
#print('population in counties: ', county_chart_provisional_df.total_pop.sum())
#print('as % of total population: ', county_chart_provisional_df.total_pop.sum() / 324697795)  # Population covered in these counties
#print('known race/ethnicity: ', nyt_merged_provisional_df.total_known_cases.sum())
#print('% known race/ethnicity: ', nyt_merged_provisional_df.total_known_cases.sum() / nyt_merged_provisional_df.COVID_19_Deaths.sum())
#print(0.55 * 324697795) # NYT population
#print('state file deaths: ', cdc_provisional_crdt_merged_df.cdc_cases.sum())
#print('state file known race/ethnicity: ', cdc_provisional_crdt_merged_df.cdc_known_cases.sum())
#print('state file % known race/ethnicity: ', cdc_provisional_crdt_merged_df.cdc_known_cases.sum() / cdc_provisional_crdt_merged_df.cdc_cases.sum())
In [51]:
#@title
cdc_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_title = 'Ratio of CDC Provisional Deaths to CRDT Deaths as of %s' % provisional_date_display_name

CreateScatterPlotAndMap(
    cdc_provisional_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()
In [52]:
#@title
cdc_nyt_provisional_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_provisional_title = 'Ratio of CDC Provisional Deaths to NYT Deaths as of %s' % provisional_date_display_name

scatter = CreateScatterPlotAndMap(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, 18000, scatter_height, scatter_width, map_width, 'county', 'ratio'
)

(scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Deaths with Race/Ethnicity

In the charts and maps below, we can see that the CDC Provisional deaths data has a strikingly high percentage of deaths with known race/ethnicity. The state data had both counts and percentages of deaths from COVID-19 within each race/ethncity group but not a total deaths count. There is an "Other" category that includes More than one race or Unknown, and we treated that entire category as Unknown race/ethnicity. The percentages for all the categories including Other didn't always add up to 100%, so we treated the remaining percentage as Unknowns as well (data for a group can be suppressed if that group had fewer than 10 deaths). The county data included only percentages of deaths from COVID-19 within each race/ethnicity group, but it did include a total count per county so that we could calculate counts.

In [53]:
#@title

crdt_provisional_known_state_fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

crdt_provisional_known_state_title = 'CRDT Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
crdt_provisional_known_map = CreateMap(
    cdc_provisional_crdt_merged_df, crdt_provisional_known_state_fields_dict, crdt_provisional_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)
cdc_provisional_known_state_fields_dict = {
    'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

cdc_provisional_known_state_title = 'CDC Provisional Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
cdc_provisional_known_state_map = CreateMap(
    cdc_provisional_crdt_merged_df, cdc_provisional_known_state_fields_dict, cdc_provisional_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(crdt_provisional_known_map | cdc_provisional_known_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [59]:
#@title
fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT known race/ethnicity deaths'},
    'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC known race/ethnicity deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
title = 'Ratio of CDC Provisional to CRDT Deaths with Known Race/Ethnicity'

CreateScatterPlotAndMap(
    cdc_provisional_crdt_merged_df, fields_dict, title, 40000, scatter_height, scatter_width, map_width - 5, 'state', 'ratio'
).display()
In [56]:
#@title
cdc_known_county_provisional_fields_dict = {
    'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_provisional_title = 'CDC Provisional Deaths with Known Race/Ethnicity as of %s' % provisional_date_display_name
cdc_known_county_provisional_map = CreateMap(
    nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

(cdc_known_county_provisional_map).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

We can look at the composite measure of how well each county reports on total deaths from CRDT/NYT and how many of those deaths have race/ethnicity data. In the CDC Provisional Deaths data, almost all states and counties are in the top right quadrant, which means that they have a mid-high percentage of total deaths and a mid-high percentage of deaths with race/ethnicity.

In [75]:
#@title
cdc_provisional_crdt_merged_df['percent_max_100'] = cdc_provisional_crdt_merged_df.percent.clip(upper=1)
cdc_provisional_crdt_merged_df['percent_reccs'] = cdc_provisional_crdt_merged_df.percent_max_100 * cdc_provisional_crdt_merged_df.cdc_known_cases_percent
provisional_state_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of CRDT total deaths'},
    'x': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'CDC percent with known or suppressed race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of CRDT total with race/ethnicity'},
}
provisional_state_reccs_title = 'CDC Provisional Percent of Total Deaths x Race/Ethnicity as of %s' % provisional_date_display_name

scatter = CreateScatterPlotAndMap(
    cdc_provisional_crdt_merged_df, provisional_state_reccs_fields_dict, provisional_state_reccs_title, 1, scatter_height, scatter_width, map_width, 'state', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [76]:
#@title
nyt_merged_provisional_df['percent_max_100'] = nyt_merged_provisional_df.percent.clip(upper=1)
nyt_merged_provisional_df['percent_reccs'] = nyt_merged_provisional_df.percent_max_100 * nyt_merged_provisional_df.percent_known_cases
county_reccs_provisional_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
    'x': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_provisional_title = state_reccs_title = 'CDC Provisional Percent of Total Deaths x Race/Ethnicity as of %s' % provisional_date_display_name

scatter_provisional = CreateScatterPlotAndMap(
    nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter_provisional.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Notes:

  • All states or counties with > 100% of the total deaths in the CRDT or NYT data were capped at 100%.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

If you are looking for state- or county-level deaths data with race/ethnicity, the CDC Provisional Deaths data is more complete than the CDC Case Surveillance data in almost every way: 98%-99% of the deaths reported have race/ethnicity, it generally does not lag far behind state website data as reported in the CRDT/NYT, and it is updated once a week rather than once a month. However, there are a few tradeoffs:

  • Only 20% of counties are included in the county data even though those counties account for 78% of the U.S. population. The county data should not be used at the state or U.S. levels due to the exclusion of counties with fewer than 100 COVID-19 deaths, which are generally counties with small populations. The county data could be used for trends across the U.S. in counties with large populations.
  • The data is cumulative, so you will not be able to analyze deaths over time unless you collect the weekly updates from the CDC.
  • Age and race data is available at the state level only in a separate dataset. All the provisional deaths datasets are listed at the CDC site.

Appendix

Large county maps

To make it easier to hover over small counties, here are larger versions of the county maps that appeared in this report.

In [43]:
#@title
cdc_nyt_map = CreateMap(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)
cdc_nyt_map.display()
In [44]:
#@title
cdc_known_county_map = CreateMap(
    chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)

cdc_known_county_map.display()
In [45]:
#@title
cdc_known_or_na_county_map = CreateMap(
    chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)
cdc_known_or_na_county_map.display()
In [70]:
#@title
county_completeness = CreateMap(
    nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)
county_completeness.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [62]:
#@title
cdc_nyt_map = CreateMap(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
)

cdc_nyt_map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [63]:
#@title
cdc_known_county_map = CreateMap(
    nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
)

cdc_known_county_map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [77]:
#@title
map = CreateMap(
    nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)

map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Geographic Notes

The CDC Provisional State Death data had separate rows for New York State and New York City; we combined them into New York State.

Data Citations and Disclaimers

  • CDC data full citation: Centers for Disease Control and Prevention, COVID-19 Response. COVID-19 Case Surveillance Data Access, Summary, and Limitations (version date: December 31, 2020).
  • Per the CDC data agreement: The CDC does not take responsibility for the scientific validity or accuracy of methodology, results, statistical analyses, or conclusions presented.
  • CDC Provisional Deaths Data: State, County, and dataset index. National Center for Health Statistics (NCHS), National Vital Statistics System. Estimates are based on provisional data.
  • Population data: U.S. Census Bureau's American Community Survey 2019 5-year estimates accessed via API; e.g., sample query.
  • Covid Racial Data Tracker data: Available in a public spreadsheet.
  • New York Times data: Available as a public CSV file or via Google Cloud Public Datasets.

Contact information

Please email us at shli-covid-data-analysis@googlegroups.com with questions or comments.

In [ ]:
#%%shell
#jupyter nbconvert --to html 'cdc_death_data.ipynb'